Fields for each crash |
|
Date: |
Date of accident, in the format - January 01, 2001 |
Time: |
Local time, in 24 hr. format unless otherwise specified |
Location: |
Location of accident |
Airline/Op: |
Airline or operator of the aircraft |
Flight #: |
Flight number assigned by the aircraft operator |
Route: |
Complete or partial route flown prior to the accident |
AC Type: |
Aircraft type |
Reg: |
ICAO registration of the aircraft |
cn / ln: |
Construction or serial number / Line or fuselage number |
Aboard: |
Total aboard (passengers / crew) |
Fatalities: |
Total fatalities aboard (passengers / crew) |
Ground: |
Total killed on the ground |
Summary: |
Brief description of the accident and cause if known |
We can add the following fields by using the existing fields and the Google Geocoding API.
For the following two fields, we can use the 'location' field of the dataset
For the following four fields, we can use the 'Route' field of the dataset. It is of the format <-source->-<-stop1->-<-stop2->-<-destination->
In [1]:
__author__ = 'shivam_gaur'
import requests
from bs4 import BeautifulSoup
import re
from pymongo import MongoClient
You have to set up your Geocoding API Key before you proceed.
client_key: insert your Google Maps Geocoding API client_key as specified below . Check Geocoding API docs for help.
_URL_: The Google Maps Geocoding API url constant. Must not be changed.
In [2]:
# Global Config Variables
client_key = '&key=<insert_your_39_character_api_key_here>'
_URL_ = 'https://maps.googleapis.com/maps/api/geocode/xml?address='
count = 0
# same helper function as the Flight Crash Data Crawler
def makeBeautifulSoupObject(url):
# Use a `Session` instance to customize how `requests` handles making HTTP requests.
session = requests.Session()
# `mount` a custom adapter that retries failed connections for HTTP and HTTPS requests, in this case- 5 times
session.mount("http://", requests.adapters.HTTPAdapter(max_retries=5))
session.mount("https://", requests.adapters.HTTPAdapter(max_retries=5))
source_code = session.get(url=url)
plain_text = source_code.text.encode('utf8')
soup = BeautifulSoup(plain_text, "lxml")
return soup
In [3]:
# Connecting to Mongo instance
client = MongoClient()
# specify the name of the db in brackets
db = client['aircrashdb']
# specify the name of the collection in brackets
collection = db['crawled_data']
In [4]:
def Request_and_append(address_field):
print (address_field)
print ('\n')
finalurl = _URL_ + address_field + client_key_sohail
soup = makeBeautifulSoupObject(finalurl)
lat_ = soup.find_all('lat')
long_ = soup.findAll('lng')
collection.find_one_and_update({'_id':cur["_id"]},{'$set':{'geolat':lat_[0].string}})
collection.find_one_and_update({'_id':cur["_id"]},{'$set':{'geolong':long_[0].string}})
print (lat_[0].string + ' & ' + long_[0].string + ' - DONE. \n')
After running the code block below, the following fields should've be added to MongoDB Collection:
In [ ]:
# for all the records in the collection
cursor = collection.find()
for cur in cursor:
print(cur["loc"])
if not cur["loc"] =='NULL':
# if the latitude and logitude of aircrash location do not exist
if not "geolat" in cur or not "geolong" in cur:
try:
if not cur['loc'] == 'NULL':
address_field = '+'.join(cur['loc'].split(' '))
Request_and_append(address_field)
count = count + 1
else:
print ("NULL- No Route Field")
except:
print ("COULD NOT PROCESS " + cur['loc'].encode('utf-8'))
new_attempt1 = cur['loc'].encode('utf-8').rpartition(',')[-1]
print ('trying : ' + new_attempt1)
try:
address_field = '+'.join(new_attempt1.encode('utf-8').strip().split(' '))
Request_and_append(address_field)
except:
print ('New attempt has failed as well')
new_attempt2 = cur['loc'].encode('utf-8')
new_attempt2 = re.sub('[^0-9a-zA-Z ]+', '', new_attempt2)
arr = new_attempt2.split()
try:
i=0
for s in arr:
if (s.lower() == 'coast'):
new_attempt_final = (arr [i-1] + ' ' + arr[i]).encode('utf-8')
address_field = '+'.join(new_attempt_final.encode('utf-8').strip().split(' '))
Request_and_append(address_field)
break
elif (s.lower() == 'ocean'):
new_attempt_final = (arr [i-1] + ' ' + arr[i]).encode('utf-8')
address_field = '+'.join(new_attempt_final.encode('utf-8').strip().split(' '))
Request_and_append(address_field)
break
elif (s.lower() == 'sea'):
new_attempt_final = (arr [i-1] + ' ' + arr[i]).encode('utf-8')
address_field = '+'.join(new_attempt_final.encode('utf-8').strip().split(' '))
Request_and_append(address_field)
break
elif (s.lower() == 'off'):
new_attempt_final = (' '.join(arr [i+1:])).encode('utf-8')
address_field = '+'.join(new_attempt_final.encode('utf-8').strip().split(' '))
Request_and_append(address_field)
break
elif (s.lower() == 'persian'): # For persian gulf
new_attempt_final = (arr [i] + ' ' + arr[i+1]).encode('utf-8')
address_field = '+'.join(new_attempt_final.encode('utf-8').strip().split(' '))
Request_and_append(address_field)
break
elif (s.lower() == 'gulf'):
new_attempt_final = (arr [i] + ' ' + arr[i+1]+ ' ' + arr[i+2]).encode('utf-8')
address_field = '+'.join(new_attempt_final.encode('utf-8').strip().split(' '))
Request_and_append(address_field)
break
else:
new_attempt_final = arr [-1]
address_field = '+'.join(new_attempt_final.encode('utf-8').strip().split(' '))
Request_and_append(address_field)
i = i+1
i=0
except:
print ("I AM SORRY, THIS LOCATION CANNOT BE PROCESSED")
else:
# if the latitude and logitude of aircrash location ALREADY EXIST. This is in case this code block is run multiple times.
count = count + 1
print (cur['loc'].encode('utf-8')+' - ALREADY PROCESSED')
else:
print("ROUTE ===== NULL")
print (" TOTAL RECORDS THAT HAVE LATS AND LONGS: " + str(count))
There are a few try and except blocks as the location strings do not follow a nice format.
Some special cases have to be handled,
Open to change. Please email me at shivam6294@gmail.com if you can think of a more elegant solution to handle special cases with the geocoding api
After running the code block below, the following fields should've be added to MongoDB Collection:
In [ ]:
counter = 0
for cur in cursor:
print(cur["route"])
if not cur["route"]=='NULL':
if not "srclat" in cur and not "srclong" in cur or not "deslat" in cur and not "deslong" in cur:
try:
if not cur['route'] == 'NULL':
source_dest = cur["route"].split('-')
source_dest[0] = source_dest[0].strip()
source_dest[-1] = source_dest[-1].strip()
address_field1 = ' '.join(source_dest[0].split(' '))
print (address_field1)
address_field2 = ' '.join(source_dest[-1].split(' '))
print (address_field2)
print ('\n')
finalurl1 = url + address_field1 + client_key_sohail
finalurl2 = url + address_field2 + client_key_sohail
soup1 = makeBeautifulSoupObject(finalurl1)
soup2 = makeBeautifulSoupObject(finalurl2)
srclat = soup1.find_all('lat')
srclong = soup1.findAll('lng')
deslat = soup2.find_all('lat')
deslong = soup2.find_all('lng')
collection.find_one_and_update({'_id':cur["_id"]},{'$set':{'srclat':srclat[0].string}})
collection.find_one_and_update({'_id':cur["_id"]},{'$set':{'srclong':srclong[0].string}})
collection.find_one_and_update({'_id':cur["_id"]},{'$set':{'deslat':deslat[0].string}})
collection.find_one_and_update({'_id':cur["_id"]},{'$set':{'deslong':deslong[0].string}})
print (srclat[0].string)
print (srclong[0].string)
print (deslat[0].string)
print (deslong[0].string)
counter = counter +1
else:
print ("NULL- No Route Field")
except:
print ("COULD NOT PROCESS " + cur['route'].encode('utf-8'))
else:
print ("ALREADY PROCESSED: " + cur['route'].encode('utf-8'))
counter = counter +1
else:
print("ROUTE == NULL")
print ('TOTAL COUNTER: ' + str(counter))
In [ ]:
# Importing the required libraries
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import Element
import xml.etree.ElementTree as etree
import xml.dom.minidom
In [ ]:
root = Element('root')
tree = ElementTree(root)
for cur in cursor:
if "geolat" in cur and "geolong" in cur:
element = Element('element')
root.append(element)
date = Element('date')
date.text= str(cur['date'])
element.append(date)
lat = Element('lat')
lat.text= cur['geolat']
element.append(lat)
long = Element('long')
long.text= cur['geolong']
element.append(long)
fatal = Element('fatal')
if not cur['fatalities_total'] == 'NULL' and not cur['ground'] == 'NULL':
total_fatalities = int(cur['fatalities_total']) + int(cur['ground'])
fatal.text= str(total_fatalities)
elif cur['fatalities_total'] == 'NULL':
fatal.text= cur['ground']
elif cur['ground'] == 'NULL':
fatal.text= cur['fatalities_total']
else:
fatal.text= cur['fatalities_total']
element.append(fatal)
xml = xml.dom.minidom.parseString(etree.tostring(root))
pretty_xml_as_string = xml.toprettyxml()
print (pretty_xml_as_string)
with open(r'C:\Users\admin\Desktop\GE2324\crash_location_data_with_total_fatal.xml', "wb") as f:
f.write(pretty_xml_as_string.encode('utf-8'))
This is what the XML should look like:
In [ ]:
cursor = collection.find()
root = Element('root')
tree = ElementTree(root)
for cur in cursor:
if "srclat" in cur and "srclong" in cur and "deslat" in cur and "deslong" in cur:
element = Element('element')
root.append(element)
srclat = Element('srclat')
srclat.text= cur['srclat']
element.append(srclat)
srclong = Element('srclong')
srclong.text= cur['srclong']
element.append(srclong)
deslat = Element('deslat')
deslat.text= cur['deslat']
element.append(deslat)
deslong = Element('deslong')
deslong.text= cur['deslong']
element.append(deslong)
xml = xml.dom.minidom.parseString(etree.tostring(root))
pretty_xml_as_string = xml.toprettyxml()
print (pretty_xml_as_string)
with open('route_data.xml', "wb") as f:
f.write(pretty_xml_as_string.encode('utf-8'))
This is what the XML should look like: